In [ ]:


In [ ]:
!wget https://raw.githubusercontent.com/daniel-acuna/hackathon_syracuse/master/data/RoadRatings2015.csv

In [ ]:
!wget https://raw.githubusercontent.com/daniel-acuna/hackathon_syracuse/master/data/potholes.csv

In [ ]:
# Print the shell's current working directory.
!pwd

In [ ]:
import os

# Build the file:// URI from the notebook's working directory rather than a
# machine-specific absolute path, so the cell works wherever the CSV sits
# next to the notebook.
roadratings_path = "file://" + os.path.abspath("RoadRatings2015.csv")

# Read the CSV through the spark-csv data source: the first line is the
# header, column types are inferred, and 'NA' is passed as the null marker.
roadratings_df = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("nullValue", "NA")
    .load(roadratings_path)
)

In [ ]:
import os

# Build the file:// URI from the notebook's working directory rather than a
# machine-specific absolute path, so the cell works wherever the CSV sits
# next to the notebook.
potholes_path = "file://" + os.path.abspath("potholes.csv")

# Read the CSV through the spark-csv data source: the first line is the
# header, column types are inferred, and 'NA' is passed as the null marker.
potholes_df = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("nullValue", "NA")
    .load(potholes_path)
)

In [ ]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

In [ ]:
# Project the road-ratings frame down to the street key plus five columns.
rr_df = roadratings_df.select(
    'streetID',
    'overall', 'crack', 'patch',
    'length', 'width',
)

In [ ]:
# Project the potholes frame down to the street key and the two coordinates.
ph_df = potholes_df.select(
    'STREET_ID',
    'Latitude',
    'Longitude',
)

In [ ]:
from pyspark.sql import functions

In [ ]:
# Count pothole rows per STREET_ID and list the streets with the highest
# counts first.
(
    ph_df
    .groupBy('STREET_ID')
    .agg(functions.count('*').alias('n_potholes'))
    .orderBy(functions.col('n_potholes').desc())
    .show()
)

Activity: Compute the average number of potholes per street in the city


In [ ]:
## Your code here

In [ ]:
# Rank streets by pothole count and attach a location label to each one.
#
# The (STREET_ID, strLocation) lookup is de-duplicated before joining;
# otherwise every street row is repeated once per pothole record on that
# street. The sort is applied after the join because a join does not
# guarantee that an ordering established beforehand is preserved.
# NOTE(review): if strLocation differs between records of the same street,
# that street still appears once per distinct location — confirm against data.
(
    ph_df
    .groupBy('STREET_ID')
    .agg(functions.count('*').alias('n_potholes'))
    .join(potholes_df.select(['STREET_ID', 'strLocation']).distinct(), 'STREET_ID')
    .orderBy(functions.desc('n_potholes'))
    .show()
)

In [ ]:
# Display the column names of the projected road-ratings frame.
rr_df.columns

In [ ]:
# Combine road ratings with pothole records by matching the street key of
# each frame (streetID on the ratings side, STREET_ID on the potholes side).
dataset_df = rr_df.join(
    ph_df,
    rr_df.streetID == ph_df.STREET_ID,
)

In [ ]:
# Print the schema of the joined ratings/potholes frame.
dataset_df.printSchema()

In [ ]:
# Linear regression estimator: predicts the `overall` column from the
# `features` vector column.
lr = LinearRegression(
    labelCol='overall',
    featuresCol='features',
)

In [ ]:
# Assembler that packs the single input column `length` into the `features`
# vector column consumed by the regression stage.
va = VectorAssembler(
    outputCol='features',
    inputCols=['length'],
)

In [ ]:
# Two-stage pipeline: feature assembly followed by the regression.
pl = Pipeline(
    stages=[
        va,  # builds the `features` column from `length`
        lr,  # regresses `overall` on `features`
    ]
)

In [ ]:
# Model input: `length` and `overall` cast to double, with any row that has
# a null in either column removed.
input_dataset_df = (
    dataset_df
    .select(
        dataset_df['length'].cast('double').alias('length'),
        dataset_df['overall'].cast('double').alias('overall'),
    )
    .na.drop()
)

In [ ]:
# Split the model input roughly 80% / 20% into training and testing frames.
# A fixed seed makes the split (and therefore the fitted model and the row
# counts shown in later cells) the same on every fresh run of the notebook.
training_df, testing_df = input_dataset_df.randomSplit([0.8, 0.2], seed=42)

In [ ]:
# Number of rows in the training split.
training_df.count()

In [ ]:
# Number of rows in the testing split.
testing_df.count()

In [ ]:
# Fit the pipeline (assembler, then linear regression) on the training split.
pl_fit = pl.fit(training_df)

In [ ]:
# Run the fitted pipeline over the training split and display the result.
pl_fit.transform(training_df).show()

In [ ]:
# Row count of the training split (same call as the earlier count cell).
training_df.count()

In [ ]:
# Row count of the testing split (same call as the earlier count cell).
testing_df.count()

In [ ]:
# Run the fitted pipeline over the held-out testing split and display the result.
pl_fit.transform(testing_df).show()

Activity: Compute the mean squared error (MSE) of the model's predictions


In [ ]:

Activity: Compare models with different feature sets (length + width + Latitude + Longitude)


In [ ]: